## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'plotly'
## 
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## 
## The following object is masked from 'package:graphics':
## 
##     layout

import NHANES datafiles

Imported demographic, HIV test, and sexual behavior dataframes 1999 - 2016.

demo_list = 
  list.files(path = "NHANES/DEMO", full.names = TRUE) 
hiv_list = 
  list.files(path = "NHANES/HIV", full.names = TRUE) 
sxq_list = 
  list.files(path = "NHANES/SXQ", full.names = TRUE) 


load_files = function(x){
  
  filetype =
    str_extract(x, "DEMO|HIV|SXQ")
  
  year = 
    str_extract(x, "(?<=_)[A-Z]")
  
data = 
    read_xpt(x) |>
    janitor::clean_names()|>
    mutate(filetype = filetype, year = year)
    
}


demo_list_output = 
  map(demo_list, load_files) 

hiv_list_output = 
  map(hiv_list, load_files) 

sxq_list_output = 
  map(sxq_list, load_files)

join_pair = function(df1, df2) {
  left_join(df1, df2, by = c("seqn", "year"))
}


hiv_demo_list = 
  map2(hiv_list_output, demo_list_output, ~join_pair(.x, .y))
  
hiv_demo_sxq_df = 
  map2(hiv_demo_list, sxq_list_output, ~join_pair(.x, .y)) |>
  bind_rows() 

Re-coded variables of interest: Restricted particpants’s age between 20 and 49

cleaned_joined_df = 
  hiv_demo_sxq_df |>
  filter(ridageyr >= 20 & ridageyr <= 49) |>
  mutate(hiv = case_when(
    lbdhi == 1 | lbxhivc == 1 ~ 1,
    lbdhi == 2 | lbxhivc == 2 ~ 2)
    )|> 
  mutate(MSM = case_when(
    year %in% c("A","B","C") ~ sxq220,
    year %in% c("D","E","F","G","H","I") ~ sxq550)
    )|>
   mutate(WSW = case_when(
    year %in% c("A","B","C") ~ sxq150,
    year %in% c("D","E","F","G","H","I") ~ sxq490)
    )|>
  mutate(year = recode(year, 
                       "A" = "1999-2000",
                       "B" = "2001-2002",
                       "C" = "2003-2004",
                       "D" = "2005-2006",
                       "E" = "2007-2008",
                       "F" = "2009-2010",
                       "G" = "2011-2012",
                       "H" = "2013-2014",
                       "I" = "2015-2016"),
         hiv = recode(hiv,
                            "1" = "Reactive",
                            "2" = "Non-reactive"),
         gender = recode(riagendr, 
                         "1" = "Male",
                         "2" = "Female"),
         age = ridageyr,
         race = recode(ridreth1,
                       "1" = "Mexican American",
                       "2" = "Other Hispanic",
                       "3" = "Non-Hispanic White",
                       "4" = "Non-Hispanic Black",
                       "5" = "Other Race - Including Multi-Racial"),
         education = recode(dmdeduc2,
                            "1" = "Less than 9th grade",
                            "2" = "9-11th grade",
                            "3" = "High school graduate or equivalent",
                            "4" = "Some college or AA degree",
                            "5" = "College graduate or above",
                            "7" = NA_character_,
                            "9" = NA_character_),
         marriage = recode(dmdmartl,
                           "1" = "Married", 
                           "2" = "Widowed",
                           "3" = "Divorced",
                           "4" = "Separated",
                           "5" = "Never married",
                           "6" = "Living with partner",
                           "77" = NA_character_,
                           "99" = NA_character_)
         ) |>
  mutate(samesexcontact = case_when(
    (MSM %in% c(0, 7777, 77777,9999, 99999) | WSW %in% c(0, 7777, 77777, 9999, 99999)) ~ 0,
    (MSM >= 1 & MSM <= 600) | (WSW >= 1 & WSW <= 600) ~ 1
  )) |>
  select(seqn, hiv, age, gender, race, education, marriage, year, samesexcontact)
cleaned_joined_df |>
  group_by(samesexcontact, hiv) |>
  summarize(n = n())
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## # A tibble: 9 × 3
## # Groups:   samesexcontact [3]
##   samesexcontact hiv              n
##            <dbl> <chr>        <int>
## 1              0 Non-reactive  6826
## 2              0 Reactive        33
## 3              0 <NA>           137
## 4              1 Non-reactive   572
## 5              1 Reactive        48
## 6              1 <NA>            15
## 7             NA Non-reactive 15589
## 8             NA Reactive        43
## 9             NA <NA>          1136

Visualization - 1 Total number as y-axis value Across race

cleaned_joined_df %>% 
  filter(hiv == "Reactive") %>% 
  group_by(race, gender) %>%
  summarize(n = n(), age_mean = round(mean(age), 2)) %>% 
  ungroup() %>% 
  mutate(race = fct_reorder(race, n)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~race, y = ~n, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Race"),
      yaxis = list(title = "Total Number of HIV Positive Patients"),
      title = "The Total Number of HIV Positive Patients from 1999 to 2016 Across Races"
    )
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.

Visualization - 1-1 Total percent as y-axis value

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
    ) %>% 
  group_by(race, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(race = fct_reorder(race, hiv_percent)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~race, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Race"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Races"
    )
## `summarise()` has grouped output by 'race'. You can override using the
## `.groups` argument.

Visualization 2: by year

cleaned_joined_df %>% 
  mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>% 
  group_by(year, gender) %>% 
  summarize(age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~year, y = ~hiv_percent, color = ~gender, type = "scatter", mode = "lines+markers",
          text = ~text_label, colors = "viridis") %>% 
  layout(
      xaxis = list(title = "Year"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016"
    )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Visualization 3: Across Education level

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)
    ) %>% 
  group_by(education, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade", 
                                         "High school graduate or equivalent", 
                                         "Some college or AA degree",
                                         "College graduate or above"))) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~education, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Education level"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Education Level"
    )
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations

Visualization 4: Across marital status

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv)) %>% 
  group_by(marriage, gender) %>%
  summarize(total_par = n(), age_mean = round(mean(age), 2), 
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>%
  ungroup() %>% 
  mutate(marriage = fct_reorder(marriage, hiv_percent)) %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~marriage, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Marital Status"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Marital Status"
    )
## `summarise()` has grouped output by 'marriage'. You can override using the
## `.groups` argument.
## Warning: Ignoring 2 observations

Visualization 5: Sexual behavior

cleaned_joined_df %>% 
   mutate(hiv = ifelse(is.na(hiv), "NA", hiv),
          samesexcontact = recode(samesexcontact, "0" = "No", 
                                  "1" = "Yes")) %>% 
  group_by(samesexcontact, gender) %>% 
  summarize(total_par = n(), age_mean = round(mean(age), 2), hiv_num = sum(hiv == "Reactive"),
            hiv_percent = round(sum(hiv == "Reactive") / sum(hiv != "Reactive") * 100, 3)) %>% 
  ungroup() %>% 
  mutate(text_label = str_c("mean age: ", age_mean)) %>% 
  plot_ly(x = ~samesexcontact, y = ~hiv_percent, color = ~gender, type = "bar", text = ~text_label, colors = "viridis") %>% 
    layout(
      xaxis = list(title = "Same Sex Sexual Behavior"),
      yaxis = list(title = "Total Percent of HIV Positive Patients"),
      title = "The Total Percent of HIV Positive Patients from 1999 to 2016 Across Sexual Behavior"
    )
## `summarise()` has grouped output by 'samesexcontact'. You can override using
## the `.groups` argument.
## Warning: Ignoring 2 observations

logistic regression

set the reference group

cleaned_regression_df= 
   cleaned_joined_df |> 
   mutate(
      hiv_outcome = ifelse(hiv == "Reactive", 1, ifelse(hiv == "Non-reactive", 0, NA)),
      education = forcats::fct_relevel(education, c("Less than 9th grade", "9-11th grade", "High school graduate or equivalent", "Some college or AA degree", "College graduate or above")),
      race = forcats::fct_relevel(race, c("Non-Hispanic White", "Mexican American", "Non-Hispanic Black", "Other Hispanic", "Other Race - Including Multi-Racial")),
      marriage = forcats::fct_relevel(marriage, c("Married", "Widowed", "Divorced", "Separated", "Never married", "Living with partner"))
         )

This is the full model

fit_alt = cleaned_regression_df |> 
  glm(hiv_outcome ~ samesexcontact + gender + age + race + education + marriage, data = _, family = binomial())

test samesexcontact, gender, age

fit_alt|> 
  broom::tidy() |> 
  mutate(OR = exp(estimate)) |> 
  select(term, estimate, OR, p.value)
## # A tibble: 17 × 4
##    term                                        estimate           OR  p.value
##    <chr>                                          <dbl>        <dbl>    <dbl>
##  1 (Intercept)                                 -10.1     0.0000403   3.88e-24
##  2 samesexcontact                                2.95   19.1         8.25e-26
##  3 genderMale                                    1.90    6.67        6.25e-10
##  4 age                                           0.0583  1.06        1.55e- 4
##  5 raceMexican American                          0.295   1.34        5.38e- 1
##  6 raceNon-Hispanic Black                        1.97    7.17        3.25e- 9
##  7 raceOther Hispanic                            1.08    2.96        2.45e- 2
##  8 raceOther Race - Including Multi-Racial     -14.3     0.000000618 9.78e- 1
##  9 education9-11th grade                        -0.384   0.681       5.76e- 1
## 10 educationHigh school graduate or equivalent  -0.360   0.698       5.97e- 1
## 11 educationSome college or AA degree           -0.304   0.738       6.48e- 1
## 12 educationCollege graduate or above           -1.11    0.328       1.24e- 1
## 13 marriageWidowed                               1.99    7.34        7.29e- 2
## 14 marriageDivorced                              1.12    3.08        4.02e- 2
## 15 marriageSeparated                             1.39    4.01        2.23e- 2
## 16 marriageNever married                         1.37    3.95        1.40e- 3
## 17 marriageLiving with partner                   1.08    2.93        3.36e- 2

test whether race is significant

fit_no_race = 
  glm(hiv_outcome ~ samesexcontact + gender + age + education + marriage, data = na.omit(cleaned_regression_df), family = binomial())

anova(fit_no_race, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
##   term                    df.residual residual.deviance    df deviance   p.value
##   <chr>                         <dbl>             <dbl> <dbl>    <dbl>     <dbl>
## 1 hiv_outcome ~ samesexc…        7243              639.    NA     NA   NA       
## 2 hiv_outcome ~ samesexc…        7239              581.     4     58.4  6.26e-12

test whether education is significant

fit_no_education = 
  glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = na.omit(cleaned_regression_df), family = binomial())

anova(fit_no_education, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
##   term                      df.residual residual.deviance    df deviance p.value
##   <chr>                           <dbl>             <dbl> <dbl>    <dbl>   <dbl>
## 1 hiv_outcome ~ samesexcon…        7243              587.    NA    NA     NA    
## 2 hiv_outcome ~ samesexcon…        7239              581.     4     5.65   0.227

test whether marriage is significant

fit_no_marriage = 
  glm(hiv_outcome ~ samesexcontact + gender + age + race + education, data = na.omit(cleaned_regression_df), family = binomial())

anova(fit_no_marriage, fit_alt, test = 'LRT') |> broom::tidy()
## # A tibble: 2 × 6
##   term                      df.residual residual.deviance    df deviance p.value
##   <chr>                           <dbl>             <dbl> <dbl>    <dbl>   <dbl>
## 1 hiv_outcome ~ samesexcon…        7244              595.    NA     NA   NA     
## 2 hiv_outcome ~ samesexcon…        7239              581.     5     13.7  0.0174

The final model

fit_regression = cleaned_regression_df |> 
  glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = _, family = binomial()) 

fit_regression|> 
  broom::tidy() |> 
  mutate(OR = exp(estimate)) |> 
  select(term, estimate, OR, p.value)|> 
  knitr::kable(digits = 3)
term estimate OR p.value
(Intercept) -10.460 0.000 0.000
samesexcontact 2.834 17.020 0.000
genderMale 1.820 6.171 0.000
age 0.056 1.058 0.000
raceMexican American 0.455 1.576 0.324
raceNon-Hispanic Black 2.072 7.941 0.000
raceOther Hispanic 1.245 3.473 0.009
raceOther Race - Including Multi-Racial -14.286 0.000 0.978
marriageWidowed 2.036 7.661 0.066
marriageDivorced 1.138 3.121 0.036
marriageSeparated 1.406 4.080 0.020
marriageNever married 1.341 3.823 0.002
marriageLiving with partner 1.033 2.811 0.040

Bootstrap

bootstrap_df =
  cleaned_regression_df |> 
  bootstrap(n = 500) |> 
  mutate(
    models = map(.x = strap, ~glm(hiv_outcome ~ samesexcontact + gender + age + race + marriage, data = .x, family = binomial()) ),
    results = map(models, broom::tidy)) |> 
  select(-strap, -models) |> 
  unnest(results) |> 
  group_by(term) |> 
  mutate(OR = exp(estimate))
## Warning: There were 177 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `models = map(...)`.
## Caused by warning:
## ! glm.fit: fitted probabilities numerically 0 or 1 occurred
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 176 remaining warnings.

Look at the distribution of estimates under repeated sampling

bootstrap_df|> 
  filter(term == "age") |> 
  ggplot(aes(x = estimate))+
  geom_density() +
  facet_wrap(.~ term)  

bootstrap_df|> 
  filter(term != "age") |> 
  ggplot(aes(x = estimate))+
  geom_density() +
  facet_wrap(.~ term)

Compute bootstrap mean OR and 95% CI

bootstrap_df|> 
  summarize(
    mean_OR = mean(OR),
    CI_lower = quantile(OR, 0.025),
    CI_upper = quantile(OR, 0.975)
    )|> 
  knitr::kable(digits = 3)
term mean_OR CI_lower CI_upper
(Intercept) 0.000 0.000 0.000
age 1.060 1.031 1.091
genderMale 7.006 3.357 13.656
marriageDivorced 3.845 0.908 10.485
marriageLiving with partner 3.471 1.018 10.308
marriageNever married 4.657 1.654 13.025
marriageSeparated 5.139 0.751 14.666
marriageWidowed 11.892 0.000 55.766
raceMexican American 1.745 0.513 3.860
raceNon-Hispanic Black 8.788 4.505 16.707
raceOther Hispanic 3.894 1.246 9.348
raceOther Race - Including Multi-Racial 0.000 0.000 0.000
samesexcontact 18.716 10.444 32.479